{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import scipy as sp\n", "from scipy.stats import mode\n", "from sklearn import linear_model\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "from sklearn import discriminant_analysis\n", "from sklearn.decomposition import PCA\n", "from sklearn import preprocessing\n", "from sklearn.neighbors import KNeighborsRegressor as KNN\n", "%matplotlib inline\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import os, random" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def GetPandasFromFileCSV(path):\n", "    \"\"\"Load the comma-delimited file at `path` into a pandas DataFrame.\"\"\"\n", "    frame = pd.read_csv(path, delimiter=',')\n", "    return frame\n", "\n", "\n", "def GetPandasFromFile(path, theSkipRow):\n", "    \"\"\"Load a header-less CSV at `path`; `theSkipRow` is forwarded to read_csv as `skiprows`.\"\"\"\n", "    frame = pd.read_csv(path, header=None, skiprows=theSkipRow)\n", "    return frame" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(13049, 47)\n", " pixelPlant pixelPole pixelLake pixelRoad pixelGrass pixelWall \\\n", "0 0.0 0.000 60.99 2.671 0.0 2.116 \n", "1 0.0 0.004 34.12 0.217 0.0 4.409 \n", "2 0.0 0.000 0.00 0.000 0.0 0.000 \n", "\n", " pixelCar propertiesAsses pixelSea numCraigslistHouse ... pixelSky \\\n", "0 6.639 142585895 0.0 0 ... 18.16 \n", "1 22.560 173725104 0.0 0 ... 27.81 \n", "2 0.000 243090896 0.0 0 ... 
0.00 \n", "\n", " Latitude Longitude Address Zip RoomType \\\n", "0 42.358550 -71.064780 37 Mount Vernon #4 Boston 02108 2108 3 \n", "1 42.356533 -71.070305 3 Byron St Boston 02108 2108 3 \n", "2 42.355400 -71.061510 3 Winter Pl Boston 02108 2108 2 \n", "\n", " Bathrooms SQFT SQM Price \n", "0 2.0 1425 132.386775 4250.0 \n", "1 3.5 2500 232.2575 9500.0 \n", "2 2.5 2250 209.03175 8500.0 \n", "\n", "[3 rows x 47 columns]\n", "['pixelPlant' 'pixelPole' 'pixelLake' 'pixelRoad' 'pixelGrass' 'pixelWall'\n", " 'pixelCar' 'propertiesAsses' 'pixelSea' 'numCraigslistHouse' 'pixelRiver'\n", " 'pixelBus' 'pixelCeiling' 'pixelPath' 'pixelBuilding' 'crime' 'pixelFence'\n", " 'walkSchool' 'walkMbta' 'energySiteEUI' 'pixelPerson' 'pixelTree'\n", " 'pixelVan' 'walkPark' 'walkUniversity' 'pixelSidewalk' 'pixelGround'\n", " 'pixelMountain' 'pixelPalmTree' 'pixelHouse' 'pixelBridge' 'pixelSign'\n", " 'pixelRailing' 'pixelField' 'pixelWindow' 'pixelGrandstand'\n", " 'numCraigslistRoom' 'pixelSky' 'Latitude' 'Longitude' 'Address' 'Zip'\n", " 'RoomType' 'Bathrooms' 'SQFT' 'SQM' 'Price']\n" ] } ], "source": [ "# Load the CSV, then print its shape, its first three rows and its column names.\n", "# Single-argument print(...) emits the same text on Python 2 and Python 3.\n", "df = GetPandasFromFileCSV(\"[dataFinal]/_RentPriceTruliaMergeFinal.csv\")\n", "print(df.shape)\n", "print(df.head(3))\n", "print(df.columns.values)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pixelPlant 0\n", "pixelPole 0\n", "pixelLake 0\n", "pixelRoad 0\n", "pixelGrass 0\n", "pixelWall 0\n", "pixelCar 0\n", "propertiesAsses 0\n", "pixelSea 0\n", "numCraigslistHouse 0\n", "pixelRiver 0\n", "pixelBus 0\n", "pixelCeiling 0\n", "pixelPath 0\n", "pixelBuilding 0\n", "crime 0\n", "pixelFence 0\n", "walkSchool 0\n", "walkMbta 0\n", "energySiteEUI 0\n", "pixelPerson 0\n", "pixelTree 0\n", "pixelVan 0\n", "walkPark 0\n", "walkUniversity 0\n", "pixelSidewalk 0\n", "pixelGround 0\n", "pixelMountain 0\n", "pixelPalmTree 0\n", "pixelHouse 0\n", "pixelBridge 0\n", "pixelSign 0\n", 
"pixelRailing 0\n", "pixelField 0\n", "pixelWindow 0\n", "pixelGrandstand 0\n", "numCraigslistRoom 0\n", "pixelSky 0\n", "Latitude 0\n", "Longitude 0\n", "Address 0\n", "Zip 0\n", "RoomType 995\n", "Bathrooms 125\n", "SQFT 8630\n", "SQM 0\n", "Price 27\n" ] } ], "source": [ "# Print the number of missing values in every column, one \"name count\" line each.\n", "# A single vectorised isnull().sum() pass replaces building one filtered copy of\n", "# the whole frame per column just to take its length.\n", "null_counts = df.isnull().sum()\n", "for col in df.columns:\n", "    print(\"%s %d\" % (col, null_counts[col]))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pixelPlant float64\n", "pixelPole float64\n", "pixelLake float64\n", "pixelRoad float64\n", "pixelGrass float64\n", "pixelWall float64\n", "pixelCar float64\n", "propertiesAsses int64\n", "pixelSea float64\n", "numCraigslistHouse int64\n", "pixelRiver float64\n", "pixelBus float64\n", "pixelCeiling float64\n", "pixelPath float64\n", "pixelBuilding float64\n", "crime int64\n", "pixelFence int64\n", "walkSchool int64\n", "walkMbta int64\n", "energySiteEUI float64\n", "pixelPerson float64\n", "pixelTree float64\n", "pixelVan float64\n", "walkPark int64\n", "walkUniversity int64\n", "pixelSidewalk float64\n", "pixelGround float64\n", "pixelMountain float64\n", "pixelPalmTree float64\n", "pixelHouse float64\n", "pixelBridge float64\n", "pixelSign float64\n", "pixelRailing float64\n", "pixelField float64\n", "pixelWindow float64\n", "pixelGrandstand float64\n", "numCraigslistRoom int64\n", "pixelSky float64\n", "Latitude float64\n", "Longitude float64\n", "Address object\n", "Zip int64\n", "RoomType float64\n", "Bathrooms float64\n", "SQFT float64\n", "SQM float64\n", "Price float64\n", "['propertiesAsses', 'numCraigslistHouse', 'crime', 'pixelFence', 'walkSchool', 'walkMbta', 'walkPark', 'walkUniversity', 'numCraigslistRoom', 'Zip']\n", "----------------------\n", "['Address']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\EllieHan\\Anaconda2\\lib\\site-packages\\ipykernel\\__main__.py:1: FutureWarning: convert_objects is deprecated. 
Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.\n", " if __name__ == '__main__':\n" ] } ], "source": [ "# Coerce text columns that really hold numbers to a numeric dtype.\n", "# This replaces the deprecated DataFrame.convert_objects(convert_numeric=True);\n", "# a column whose values would all become NaN (free text) is left unchanged, which is\n", "# how convert_objects behaved. NOTE(review): the FutureWarning stored in this cell's\n", "# saved output predates this change and disappears on the next run.\n", "data = df.copy()\n", "for col in data.select_dtypes(include=['object']).columns:\n", "    as_number = pd.to_numeric(data[col], errors='coerce')\n", "    if not as_number.isnull().all():\n", "        data[col] = as_number\n", "\n", "# Report every column's dtype and collect the ones needing further treatment\n", "to_float = []\n", "to_encode = []\n", "for col in data.columns:\n", "    if data[col].dtype == 'object':\n", "        to_encode.append(col)\n", "    if data[col].dtype == 'int64':\n", "        to_float.append(col)\n", "    print(\"%s %s\" % (col, data[col].dtype))\n", "\n", "print(to_float)\n", "print(\"----------------------\")\n", "print(to_encode)\n", "\n", "# Cast the int64 columns to float so they pass through encode_categorical untouched\n", "for feature_name in to_float:\n", "    data[feature_name] = data[feature_name].astype(float)\n", "\n", "def encode_categorical(array):\n", "    \"\"\"Label-encode `array` unless its dtype is float64, in which case return it unchanged.\"\"\"\n", "    if array.dtype != np.dtype('float64'):\n", "        return preprocessing.LabelEncoder().fit_transform(array)\n", "    return array\n", "\n", "# Categorical columns for use in one-hot encoder\n", "categorical = (data.dtypes.values != np.dtype('float64'))\n", "\n", "# Encode all labels\n", "data = data.apply(encode_categorical)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | pixelPlant | \n", "pixelPole | \n", "pixelLake | \n", "pixelRoad | \n", "pixelGrass | \n", "pixelWall | \n", "pixelCar | \n", "propertiesAsses | \n", "pixelSea | \n", "numCraigslistHouse | \n", "... | \n", "pixelSky | \n", "Latitude | \n", "Longitude | \n", "Address | \n", "Zip | \n", "RoomType | \n", "Bathrooms | \n", "SQFT | \n", "SQM | \n", "Price | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.0 | \n", "0.000 | \n", "60.99 | \n", "2.671 | \n", "0.0 | \n", "2.116 | \n", "6.639 | \n", "142585895.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "18.16 | \n", "42.358550 | \n", "-71.064780 | \n", "4755 | \n", "2108.0 | \n", "3.0 | \n", "2.0 | \n", "1425.0 | \n", "132.386775 | \n", "4250.0 | \n", "
1 | \n", "0.0 | \n", "0.004 | \n", "34.12 | \n", "0.217 | \n", "0.0 | \n", "4.409 | \n", "22.560 | \n", "173725104.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "27.81 | \n", "42.356533 | \n", "-71.070305 | \n", "4058 | \n", "2108.0 | \n", "3.0 | \n", "3.5 | \n", "2500.0 | \n", "232.257500 | \n", "9500.0 | \n", "
2 | \n", "0.0 | \n", "0.000 | \n", "0.00 | \n", "0.000 | \n", "0.0 | \n", "0.000 | \n", "0.000 | \n", "243090896.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.00 | \n", "42.355400 | \n", "-71.061510 | \n", "4096 | \n", "2108.0 | \n", "2.0 | \n", "2.5 | \n", "2250.0 | \n", "209.031750 | \n", "8500.0 | \n", "
3 | \n", "0.0 | \n", "0.000 | \n", "37.65 | \n", "1.242 | \n", "0.0 | \n", "0.694 | \n", "0.020 | \n", "216929815.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "24.04 | \n", "42.356464 | \n", "-71.061760 | \n", "6249 | \n", "2108.0 | \n", "4.0 | \n", "2.0 | \n", "1325.0 | \n", "123.096475 | \n", "7200.0 | \n", "
4 | \n", "0.0 | \n", "0.000 | \n", "37.65 | \n", "1.242 | \n", "0.0 | \n", "0.694 | \n", "0.020 | \n", "216929815.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "24.04 | \n", "42.356464 | \n", "-71.061760 | \n", "6242 | \n", "2108.0 | \n", "2.0 | \n", "1.0 | \n", "750.0 | \n", "69.677250 | \n", "3800.0 | \n", "
5 rows × 47 columns
\n", "\n", " |
---|